The dataset was collected from the Kaggle website, and the data was originally created by the Starbucks store locator website. The data includes every Starbucks location worldwide through November 2021, with store numbers, country codes, ownership type codes, operating hours, latitude, longitude, store addresses, and the time zone of each store's location. Starbucks operates more than 28,000 retail stores in 49 countries.
Since Starbucks is one of the largest coffee-and-beverage companies, it has spread throughout the world. Through this project, we compare the number of Starbucks stores by country to see which countries have a high number of stores, and we analyze the density of Starbucks by location. We analyze not only worldwide but also nationwide, to see which US states have the most Starbucks stores.
# World-wide density heat map of all store locations.
# NOTE(review): the "stamen-terrain" basemap was retired by Stamen in 2023;
# this style may need an updated tile provider/token to render today — confirm.
fig <- plot_ly(
  data,
  type = 'densitymapbox',
  lat = ~latitude,
  lon = ~longitude,
  coloraxis = 'coloraxis',
  radius = 10
)
fig <- fig %>%
  layout(
    mapbox = list(
      style = "stamen-terrain",
      center = list(lon = 180)   # center the map on the Pacific
    ),
    coloraxis = list(colorscale = "Viridis")
  )
# Count stores per country and plot them as a descending bar chart.
# BUG FIX: the sorted table was assigned to `figa` but the next line used `a`,
# so `a <- as.data.frame(a)` failed on an undefined object.
a <- sort(table(data$countryCode), decreasing = TRUE)
a <- as.data.frame(a)   # columns: Var1 = country code, Freq = store count
fig1 <- plot_ly(a, x = ~Var1, y = ~Freq, type = 'bar')
fig1 <- fig1 %>% layout(
  title = "Number of Starbucks in Each Country",  # typo fix: "Countries" -> "Country"
  yaxis = list(type = 'linear')
)
fig1
This data shows a summary of the number of stores around the world. There is a large gap between the mean and the median because a small number of countries account for a very large share of the stores.
# Per-country store counts and a five-number summary across countries.
data1 <- table(data$countryCode)     # store count for each country code
dataforsum <- as.data.frame(data1)   # Var1 = country code, Freq = count
allsum <- summary(dataforsum$Freq)
# Print the summary statistics as readable sentences.
paste("The", names(allsum), "number of Starbucks around the world is", allsum)
## [1] "The Min. number of Starbucks around the world is 1"
## [2] "The 1st Qu. number of Starbucks around the world is 26"
## [3] "The Median number of Starbucks around the world is 62"
## [4] "The Mean number of Starbucks around the world is 577.326530612245"
## [5] "The 3rd Qu. number of Starbucks around the world is 317"
## [6] "The Max. number of Starbucks around the world is 15003"
These are the countries with the largest numbers of stores.
# Five countries with the largest store counts.
top5 <- sort(data1, decreasing = TRUE)[1:5]
top5
##
##    US    CN    JP    KR    CA
## 15003  2665  1665  1493  1219
# Box plot of the per-country store counts for the whole world.
fig3 <- plot_ly(y = dataforsum$Freq, type = 'box', name = "World")
fig3
Since the range of the outliers is huge, we divided the data into 4 categories and used the smallest outlier as the lower boundary of the "extremely high" class.
# Find the boxplot outliers among the per-country counts; the smallest
# outlier (975) becomes the cut-off for the "extremely high" class.
outlier_values <- boxplot.stats(dataforsum$Freq)$out
outlier_rows <- which(dataforsum$Freq %in% c(outlier_values))  # row positions of outliers
min(dataforsum[outlier_rows, ]$Freq)
## [1] 975
# Density bands derived from the cut-off above:
#   <= 375      low
#   375 - 675   moderate
#   675 - 975   high
#   >= 975      extremely high
LtoM <- 375   # low / moderate boundary
MtoH <- 675   # moderate / high boundary
HtoE <- 975   # high / extremely-high boundary
# Split the per-country counts (table `data1`) into the four density bands
# defined by the LtoM / MtoH / HtoE thresholds above. Subsetting a 1-D table
# keeps its class, so as.data.frame() yields Var1/Freq columns (dataL$Freq is
# read later for the low-density summary).
dataL <- as.data.frame(data1[data1<=LtoM])
dataM <- data1[data1>LtoM]
dataM <- as.data.frame(dataM[dataM<=MtoH])
dataH <- data1[data1>MtoH]
dataH <- as.data.frame(dataH[dataH<=HtoE])
dataE <- as.data.frame(data1[data1>HtoE])
# Number of countries falling in each density band.
low <- nrow(dataL)
moderate <- nrow(dataM)
high <- nrow(dataH)
ehigh <- nrow(dataE)
# Build a one-column table of the counts; naming `low` "number" makes rbind()
# carry "number" through as the column name (read later as categorical$number).
names(low) <- "number"
categorical <- rbind(low = low,moderate = moderate,high = high,Extremly_high = ehigh)
categorical <- data.frame(categorical)
categorical # show the table; original output:
## number
## low 37
## moderate 6
## high 1
## Extremly_high 5
# Bar chart of how many countries fall in each density level, keeping the
# categories in logical order rather than the default alphabetical order.
x2 <- c("Low","Moderate","High","Extremely High")
fixedx2 <- factor(x2, levels = c("Low","Moderate","High","Extremely High"))
fig2 <- plot_ly(x = fixedx2, y = categorical$number, type = 'bar') %>%
  layout(title = "Number of Countries in Each Level of Density")
fig2
Since most countries fall in the low-density class, we analyzed only the low-density countries further.
# Summary statistics restricted to the low-density countries.
# NOTE(review): the "Denisity" typo below is inside a runtime string and is
# kept as-is to preserve the printed output exactly.
lsum <- summary(dataL$Freq)
lsum
##  Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
##  1.00   17.00   34.00   66.76   91.00  317.00
paste("The", names(lsum), "Number of Starbucks in Low Denisity Country is", lsum)
## [1] "The Min. Number of Starbucks in Low Denisity Country is 1"
## [2] "The 1st Qu. Number of Starbucks in Low Denisity Country is 17"
## [3] "The Median Number of Starbucks in Low Denisity Country is 34"
## [4] "The Mean Number of Starbucks in Low Denisity Country is 66.7567567567568"
## [5] "The 3rd Qu. Number of Starbucks in Low Denisity Country is 91"
## [6] "The Max. Number of Starbucks in Low Denisity Country is 317"
Starbucks was founded in the United States, so the United States has the largest number of Starbucks stores. The following section analyzes the stores in the United States.
# Subset to US stores and map every location, colored by ownership type.
data2US <- subset(data, data$countryCode == "US")
# Geo layout restricted to a US map.
# BUG FIX: plotly's geo `scope` enum is lower case — 'usa' is the valid
# value; 'US' is rejected by plotly's schema validation.
g <- list(
  scope = 'usa',
  projection = list(type = 'albers usa'),
  showland = TRUE,
  landcolor = toRGB("gray95"),
  subunitcolor = toRGB("gray85"),
  countrycolor = toRGB("gray85"),
  countrywidth = 0.5,
  subunitwidth = 0.5
)
figMap <- plot_geo(data2US, lat = ~latitude, lon = ~longitude)
figMap <- figMap %>% add_markers(
  # Hover text: the three address lines plus the ownership type code.
  text = ~paste(streetAddressLine1, streetAddressLine2, streetAddressLine3,
                paste("Ownership Type Code:", ownershipTypeCode), sep = "<br />"),
  color = ~ownershipTypeCode, symbol = I("square"), size = I(8), hoverinfo = "text"
)
figMap <- figMap %>% layout(
  title = 'Starbucks in US', geo = g
)
figMap
# Original run emitted palette warnings (fewer than 3 ownership types asked
# of a 3-color minimum palette):
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
# Count stores per US state and show the distribution as a pie chart.
data2 <- table(data2US$countrySubdivisionCode)
data2_1 <- as.data.frame(data2)   # Var1 = state code, Freq = store count
figUSState <- plot_ly(data2_1, labels = ~Var1, values = ~Freq, type = 'pie')
figUSState <- figUSState %>% layout(title = 'Starbucks in US states',
  xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
  yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
figUSState
# BUG FIX: the print of `figUSState` had been fused onto the next assignment
# ("figUSStatesize = nrow(data2US)"), creating a stray `figUSStatesize`
# variable and leaving `size` undefined for the division below.
size <- nrow(data2US)        # total number of US stores
percentage <- data2 / size   # each state's share of all US stores
percentage
##
##           AK           AL           AR           AZ           CA           CO
## 0.0033326668 0.0067319869 0.0043991202 0.0356595348 0.1990935146 0.0329934013
##           CT           DC           DE           FL           GA           HI
## 0.0085316270 0.0039992002 0.0028660934 0.0533893221 0.0253282677 0.0072652136
##           IA           ID           IL           IN           KS           KY
## 0.0075984803 0.0051989602 0.0421915617 0.0173965207 0.0077317870 0.0083983203
##           LA           MA           MD           ME           MI           MN
## 0.0068652936 0.0192628141 0.0186629341 0.0021329068 0.0199293475 0.0138638939
##           MO           MS           MT           NC           ND           NE
## 0.0133306672 0.0025328268 0.0023995201 0.0285276278 0.0011331067 0.0043991202
##           NH           NJ           NM           NV           NY           OH
## 0.0020662534 0.0197293875 0.0055988802 0.0178630940 0.0431913617 0.0300606545
##           OK           OR           PA           RI           SC           SD
## 0.0062654136 0.0229287476 0.0273945211 0.0017996401 0.0139305472 0.0017996401
##           TN           TX           UT           VA           VT           WA
## 0.0161967606 0.0809838032 0.0077317870 0.0309938012 0.0007331867 0.0494567753
##           WI           WV           WY
## 0.0120642538 0.0028660934 0.0011997600
We wondered: if we pick 20 stores from the United States, what is the probability of getting a store in Massachusetts?
# Probability that a single randomly chosen US store is in Massachusetts.
probMA <- percentage[names(percentage)=='MA']
probMA # original output:
## MA
## 0.01926281
# NOTE(review): `size` here is the total number of US stores (~15k), so this
# evaluates a binomial with ~15k trials, not the 20 draws described in the
# prose. For "pick 20 stores" this would presumably be dbinom(1:20, 20, probMA)
# — confirm intent before changing, since the output shown below used `size`.
pmfMA <- dbinom(1:20,size,probMA)
pmfMA # original output:
## [1] 5.415514e-125 7.978587e-123 7.835957e-121 5.771522e-119 3.400555e-117
## [6] 1.669548e-115 7.025422e-114 2.586574e-112 8.464396e-111 2.492761e-109
## [11] 6.673344e-108 1.637529e-106 3.708884e-105 7.799809e-104 1.530849e-102
## [16] 2.816588e-101 4.877038e-100 7.975106e-99 1.235397e-97 1.817907e-96
# Tabulate the 20 pmf values with their trial index 1..20.
# NOTE(review): as.data.frame(table(pmfMA)) relies on the 20 values being
# distinct and already in increasing order; data.frame(pmfMA, Freq = 1:20)
# would be more direct and not depend on table()'s sorting.
pmfMA1 <- as.data.frame(table(pmfMA))
pmfMA1$Freq <- c(1:20)
pmfMA1 # original output:
## pmfMA Freq
## 1 5.41551439891428e-125 1
## 2 7.9785867495638e-123 2
## 3 7.83595654273215e-121 3
## 4 5.77152229386028e-119 4
## 5 3.40055473344313e-117 5
## 6 1.66954819093381e-115 6
## 7 7.0254217649047e-114 7
## 8 2.58657417793443e-112 8
## 9 8.46439631315475e-111 9
## 10 2.49276068739437e-109 10
## 11 6.67334432574018e-108 11
## 12 1.63752882062455e-106 12
## 13 3.70888438271496e-105 13
## 14 7.79980927940493e-104 14
## 15 1.53084942379238e-102 15
## 16 2.81658763182104e-101 16
## 17 4.87703806067497e-100 17
## 18 7.97510590708517e-99 18
## 19 1.23539743828312e-97 19
## 20 1.8179065574292e-96 20
# pmf plot - each probability
plot(1:20, pmfMA, type = "h", xlab = "20 times")
# BUG FIX: the assignment below had been swallowed by the trailing comment on
# the plot() line ("...each probabilitypmfUS <- ..."), so `pmfUS` was never
# created and printing it on the next line failed.
pmfUS <- dbinom(1, size, percentage)   # P(exactly 1 hit per state in `size` draws)
pmfUS
##
##            AK            AL            AR            AZ            CA
##  8.900843e-21  9.890111e-43  1.243914e-27 1.421271e-234  0.000000e+00
##            CO            CT            DC            DE            FL
## 1.279466e-216  1.917541e-54  4.677119e-25  8.575103e-18  0.000000e+00
##            GA            HI            IA            ID            IL
## 2.710574e-165  3.386075e-46  2.299267e-48  8.531494e-33 8.787069e-279
##            IN            KS            KY            LA            MA
## 1.190861e-112  3.118191e-49  1.418571e-53  1.346607e-43 5.415514e-125
##            MD            ME            MI            MN            MO
## 5.056345e-121  3.924748e-13 2.084287e-129  2.284760e-89  7.308384e-86
##            MS            MT            NC            ND            NE
##  1.139618e-15  8.015981e-15 1.156424e-186  6.978291e-07  1.243914e-27
##            NH            NJ            NM            NV            NY
##  1.035626e-12 4.402820e-128  2.205326e-35 9.838811e-116 1.411038e-285
##            OH            OK            OR            PA            RI
## 6.256299e-197  1.056211e-39 2.570067e-149 4.369090e-179  4.961707e-11
##            SC            SD            TN            TX            UT
##  8.327944e-90  4.961707e-11 9.888856e-105  0.000000e+00  3.118191e-49
##            VA            VT            WA            WI            WV
## 3.454895e-203  1.831132e-04  0.000000e+00  1.505327e-77  8.575103e-18
##            WY
##  2.715189e-07
The data show a very large standard deviation. We also generated random data from a normal distribution with the same mean and standard deviation.
# Mean and standard deviation of the per-state store counts.
# FIX: renamed from `mean`/`sd`, which shadowed the base R functions of the
# same name; nothing later in the script reads these variables.
mean_states <- mean(data2)
sd_states <- sd(data2)
mean_states
## [1] 294.1765
sd_states
## [1] 456.3604
# With such a huge sd, see what a normal sample of "virtual states" looks like.
set.seed(9672)
# generate 50 randomized virtual states from a normal distribution
normal_random <- rnorm(50, mean = mean_states, sd = sd_states)
normal_random
## [1] 24.13116 -549.83428 -215.24087 108.62747 -548.41357 -61.20251
## [7] 281.31549 27.02162 709.70516 307.10265 362.29587 496.83848
## [13] 407.95318 -34.59064 466.26633 851.32483 432.32215 -69.34155
## [19] 367.90005 973.81302 417.72619 652.96659 -84.05626 -55.41425
## [25] 458.09336 30.35105 203.40764 -351.24030 -1110.04093 339.71154
## [31] 401.78598 47.70098 259.01119 189.99031 -827.43038 773.14779
## [37] 541.91732 441.62913 772.49003 464.02372 -141.61672 505.97758
## [43] 274.93052 532.97957 378.04562 763.20978 -1007.53150 218.89160
## [49] -238.00785 -289.45595
hist(normal_random)
This data and graph present samples with 4 different sample sizes — 50, 100, 500, and 1000 — drawn from the Starbucks stores in the United States. The pie charts show the significant differences between the states.
ssize <- c(50, 100, 500, 1000)
# Simple random sampling (with replacement) at each sample size.
# BUG FIX: the population size passed to srswr() was nrow(data2) — the number
# of *states* (~51) — so only the first ~51 rows of data2US could ever be
# drawn (which is why only CO/NY/WA appeared in the original output). The
# population must be nrow(data2US), the number of US stores. Also removed the
# dead `output1` assignments, whose results were never used, and renamed the
# loop variable so it no longer clobbers the global `size`.
for (n in ssize) {
  s1 <- srswr(n, nrow(data2US))
  pickedrow1 <- (1:nrow(data2US))[s1 != 0]          # rows selected at least once
  pickedrow1 <- rep(pickedrow1, s1[s1 != 0])        # repeat rows drawn multiple times
  simplesample1 <- data2US[pickedrow1, ]
  output <- table(simplesample1$countrySubdivisionCode)
  cat("The simple random sample lays out with sample size", n, "is\n",
      names(output), "\n", output, "\n")
}
# Original run (with the undersized population) printed:
## The simple random sample lays out with sample size 50 is
## CO NY WA
## 8 28 14
## The simple random sample lays out with sample size 100 is
## CO NY WA
## 7 52 41
## The simple random sample lays out with sample size 500 is
## CO NY WA
## 37 244 219
## The simple random sample lays out with sample size 1000 is
## CO NY WA
## 80 500 420
# Pie chart: simple random sample of 50 stores.
# BUG FIX: the population size must be the number of US stores
# (nrow(data2US)), not the number of states (nrow(data2)).
s1 <- srswr(50, nrow(data2US))
pickedrow1 <- (1:nrow(data2US))[s1 != 0]
pickedrow1 <- rep(pickedrow1, s1[s1 != 0])
simplesample1 <- data2US[pickedrow1, ]
output <- table(simplesample1$countrySubdivisionCode)
output <- as.data.frame(output)
figRandomSampling50 <- plot_ly(output, labels = ~Var1, values = ~Freq, type = 'pie')
figRandomSampling50 <- figRandomSampling50 %>% layout(
  title = 'Simple Random Sampling with Sample Size = 50',
  xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
  yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
figRandomSampling50
# Pie chart: simple random sample of 100 stores.
# BUG FIX: split the print of the previous figure off the fused line, and use
# nrow(data2US) (store count) as the population, not nrow(data2) (state count).
s1 <- srswr(100, nrow(data2US))
pickedrow1 <- (1:nrow(data2US))[s1 != 0]
pickedrow1 <- rep(pickedrow1, s1[s1 != 0])
simplesample1 <- data2US[pickedrow1, ]
output <- table(simplesample1$countrySubdivisionCode)
output <- as.data.frame(output)
figRandomSampling100 <- plot_ly(output, labels = ~Var1, values = ~Freq, type = 'pie')
figRandomSampling100 <- figRandomSampling100 %>% layout(
  title = 'Simple Random Sampling with Sample Size = 100',
  xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
  yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
figRandomSampling100
# Pie chart: simple random sample of 500 stores.
# BUG FIX: split the print of the previous figure off the fused line, and use
# nrow(data2US) (store count) as the population, not nrow(data2) (state count).
s1 <- srswr(500, nrow(data2US))
pickedrow1 <- (1:nrow(data2US))[s1 != 0]
pickedrow1 <- rep(pickedrow1, s1[s1 != 0])
simplesample1 <- data2US[pickedrow1, ]
output <- table(simplesample1$countrySubdivisionCode)
output <- as.data.frame(output)
figRandomSampling500 <- plot_ly(output, labels = ~Var1, values = ~Freq, type = 'pie')
figRandomSampling500 <- figRandomSampling500 %>% layout(
  title = 'Simple Random Sampling with Sample Size = 500',
  xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
  yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
figRandomSampling500
# Pie chart: simple random sample of 1000 stores.
# BUG FIX: split the print of the previous figure off the fused line, and use
# nrow(data2US) (store count) as the population, not nrow(data2) (state count).
s1 <- srswr(1000, nrow(data2US))
pickedrow1 <- (1:nrow(data2US))[s1 != 0]
pickedrow1 <- rep(pickedrow1, s1[s1 != 0])
simplesample1 <- data2US[pickedrow1, ]
output <- table(simplesample1$countrySubdivisionCode)
output <- as.data.frame(output)
figRandomSampling1000 <- plot_ly(output, labels = ~Var1, values = ~Freq, type = 'pie')
figRandomSampling1000 <- figRandomSampling1000 %>% layout(
  title = 'Simple Random Sampling with Sample Size = 1000',
  xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
  yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
figRandomSampling1000
# Systematic sampling at each sample size.
# BUG FIX: N must be the number of US stores (nrow(data2US)). With
# N = nrow(data2) (the ~51 states) the step k collapsed to 1 and the "sample"
# was just the first rows of the data, as the original output shows. The loop
# variable is also renamed so it no longer clobbers the global `size`.
for (n in ssize) {
  N <- nrow(data2US)
  k <- ceiling(N / n)            # sampling interval
  r <- sample(k, 1)              # random starting point within the first interval
  s <- seq(r, by = k, length = n)
  systematicsample <- data2US[s, ]
  output <- table(systematicsample$countrySubdivisionCode)
  cat("The systematic sample layout with sample size", n, "is\n",
      names(output), "\n", output, "\n")
}
# Original run (with the undersized N) printed:
## The systematic sample layout with sample size 50 is
## CO NC NY WA
## 25 2 13 10
## The systematic sample layout with sample size 100 is
## CO NC NY WA
## 50 4 25 21
## The systematic sample layout with sample size 500 is
## CA CO GA NC NY PA SC WA
## 104 50 50 50 125 50 50 21
## The systematic sample layout with sample size 1000 is
## AZ CA CO GA MO NC NY PA SC TX WA
## 49 430 50 50 25 50 125 50 50 100 21
# Pie chart: systematic sample of size 50.
# BUG FIX: N must be the number of US stores (nrow(data2US)), not the number
# of states (nrow(data2)).
N <- nrow(data2US)
k <- ceiling(N / 50)
r <- sample(k, 1)
s <- seq(r, by = k, length = 50)
systematicsample <- data2US[s, ]
output <- table(systematicsample$countrySubdivisionCode)
output
# Original (buggy-N) run printed:
## CO NC NY WA
## 25 2 13 10
output <- as.data.frame(output)
output
## Var1 Freq
## 1 CO 25
## 2 NC 2
## 3 NY 13
## 4 WA 10
figSystematicSampling50 <- plot_ly(output, labels = ~Var1, values = ~Freq, type = 'pie')
figSystematicSampling50 <- figSystematicSampling50 %>% layout(
  title = 'Systematic Sampling with Sample Size = 50',
  xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
  yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
figSystematicSampling50
# Pie chart: systematic sample of size 100.
# BUG FIX: split the print of the previous figure off the fused line, and use
# N = nrow(data2US) (store count), not nrow(data2) (state count).
N <- nrow(data2US)
k <- ceiling(N / 100)
r <- sample(k, 1)
s <- seq(r, by = k, length = 100)
systematicsample <- data2US[s, ]
output <- table(systematicsample$countrySubdivisionCode)
output
# Original (buggy-N) run printed:
## CO NC NY WA
## 50 4 25 21
output <- as.data.frame(output)
output
## Var1 Freq
## 1 CO 50
## 2 NC 4
## 3 NY 25
## 4 WA 21
figSystematicSampling100 <- plot_ly(output, labels = ~Var1, values = ~Freq, type = 'pie')
figSystematicSampling100 <- figSystematicSampling100 %>% layout(
  title = 'Systematic Sampling with Sample Size = 100',
  xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
  yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
figSystematicSampling100
# Pie chart: systematic sample of size 500.
# BUG FIX: split the print of the previous figure off the fused line, and use
# N = nrow(data2US) (store count), not nrow(data2) (state count).
N <- nrow(data2US)
k <- ceiling(N / 500)
r <- sample(k, 1)
s <- seq(r, by = k, length = 500)
systematicsample <- data2US[s, ]
output <- table(systematicsample$countrySubdivisionCode)
output
# Original (buggy-N) run printed:
## CA CO GA NC NY PA SC WA
## 104 50 50 50 125 50 50 21
output <- as.data.frame(output)
output
## Var1 Freq
## 1 CA 104
## 2 CO 50
## 3 GA 50
## 4 NC 50
## 5 NY 125
## 6 PA 50
## 7 SC 50
## 8 WA 21
figSystematicSampling500 <- plot_ly(output, labels = ~Var1, values = ~Freq, type = 'pie')
figSystematicSampling500 <- figSystematicSampling500 %>% layout(
  title = 'Systematic Sampling with Sample Size = 500',
  xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
  yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
figSystematicSampling500
# Pie chart: systematic sample of size 1000.
# BUG FIX: split the print of the previous figure off the fused line, and use
# N = nrow(data2US) (store count), not nrow(data2) (state count).
N <- nrow(data2US)
k <- ceiling(N / 1000)
r <- sample(k, 1)
s <- seq(r, by = k, length = 1000)
systematicsample <- data2US[s, ]
output <- table(systematicsample$countrySubdivisionCode)
output
# Original (buggy-N) run printed:
## AZ CA CO GA MO NC NY PA SC TX WA
## 49 430 50 50 25 50 125 50 50 100 21
output <- as.data.frame(output)
output
## Var1 Freq
## 1 AZ 49
## 2 CA 430
## 3 CO 50
## 4 GA 50
## 5 MO 25
## 6 NC 50
## 7 NY 125
## 8 PA 50
## 9 SC 50
## 10 TX 100
## 11 WA 21
figSystematicSampling1000 <- plot_ly(output, labels = ~Var1, values = ~Freq, type = 'pie')
figSystematicSampling1000 <- figSystematicSampling1000 %>% layout(
  title = 'Systematic Sampling with Sample Size = 1000',
  xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
  yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
figSystematicSampling1000
Throughout the analysis, we focused on the number and locations of Starbucks stores in the world and in the United States. Before analyzing the data, we expected a high volume of Starbucks in Washington, since Starbucks started in Washington state. We also assumed that one of the European countries would be among the top five countries with a large number of Starbucks stores, because many Western people start their day with coffee. However, worldwide, it was quite surprising that Starbucks is very popular in East Asia, especially in China, South Korea, and Japan. Nationwide, although Washington is Starbucks' founding state, California has the largest number of Starbucks stores in the United States.
# Word cloud where each state's size reflects its store count.
state_counts <- table(data2US$countrySubdivisionCode)
wordclouddata <- as.data.frame(state_counts)
wordcloud2(wordclouddata)